# Dataset selection: city and scrape month identify the cleaned listings file.
city = 'barcelona'
month = '201909'
filename_in = f'src/data/{city}-{month}-listings-CLEAN.csv'
# Standard library
import json
import math
import random
import uuid

# Third-party
import catboost as cb
import chart_studio.plotly as py
import featuretools as ft
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import s2sphere as s2
import scipy.spatial as spatial
from IPython.display import display, HTML
from kmodes.kmodes import KModes
from plotly.offline import iplot, init_notebook_mode
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import export_graphviz
%run src/utils.py
# Accumulators shared across all model runs: coefficients and evaluation
# metrics, keyed by method name.
coefs = {}
metrics = {}


def collect_results(columns, model, method, r2, mae, mse, skip_coef=True):
    """Record and display coefficients (optional) and metrics for a fitted model.

    Parameters
    ----------
    columns : pandas Index/list of feature names aligned with ``model.coef_``.
    model : fitted estimator; ``coef_``/``intercept_`` are read only when
        ``skip_coef`` is False.
    method : str label under which results are stored in the global dicts.
    r2, mae, mse : metric values to record (rounded to 3 decimals).
    skip_coef : when True (default), only metrics are collected.
    """
    # coefs
    if not skip_coef:
        method_coefs = {}
        # Bug fix: the fitted-intercept attribute is 'intercept_'; the old
        # check for '__intercept' could never be true, so the intercept was
        # silently never stored.
        if hasattr(model, 'intercept_'):
            method_coefs['__intercept'] = model.intercept_
        for i in range(len(columns.values)):
            # Absolute value: only the magnitude of each coefficient is ranked.
            method_coefs[columns.values[i]] = abs(model.coef_[i])
        coefs[method] = method_coefs
        df_coefs = pd.DataFrame(coefs)
        df_coefs = df_coefs.sort_values(by=method, ascending=False)
        display(df_coefs)
    # metrics
    metrics[method] = {
        'R2': r2.round(3),
        'MAE': mae.round(3),
        'MSE': mse.round(3)
    }
    display(pd.DataFrame(metrics))
def print_feature_importances(method, importances, df):
    """Show a horizontal bar chart ranking feature importances for a model.

    ``importances`` must be aligned with the columns of ``df``.
    """
    pairs = list(zip(df.dtypes.index, importances))
    feature_score = pd.DataFrame(pairs, columns=['Feature', 'Score'])
    # Ascending order so the most important feature ends up at the top of
    # the horizontal bar chart.
    feature_score = feature_score.sort_values(by='Score', ascending=True)
    bar = go.Bar(
        x=feature_score['Score'],
        y=feature_score['Feature'],
        orientation='h'
    )
    fig = go.Figure(bar)
    fig.update_layout(
        title=method + " Feature Importance Ranking",
        # Scale the figure height with the number of features shown.
        height=25 * len(feature_score)
    )
    fig.show()
# Load the cleaned listings dataset for the selected city/month.
df = pd.read_csv(filename_in)
df.info()
# Columns considered useful for price modelling.
# NOTE(review): this list is not referenced anywhere below — the dataframe is
# filtered via the drop lists instead; it appears to serve only as schema
# documentation. Confirm before removing.
useful_cols = [
'accommodates',
'bathrooms',
'bedrooms',
'cancellation_policy',
'cleaning_fee',
'extra_people',
'guests_included',
'has_air_conditioning',
'has_bed_linens',
'has_coffee_maker',
'has_cooking_basics',
'has_dishes_and_silverware',
'has_elevator',
'has_essentials',
'has_family/kid_friendly',
'has_first_aid_kit',
'has_hair_dryer',
'has_hangers',
'has_heating',
'has_hot_water',
'has_iron',
'has_kitchen',
'has_laptop_friendly_workspace',
'has_license',
'has_long_term_stays_allowed',
'has_microwave',
'has_no_stairs_or_steps_to_enter',
'has_oven',
'has_refrigerator',
'has_shampoo',
'has_stove',
'has_tv',
'has_washer',
'has_wifi',
'instant_bookable',
'latitude',
'longitude',
'maximum_nights_avg_ntm',
'minimum_nights_avg_ntm',
'neighbourhood',
'price',
'property_type',
'room_type',
'security_deposit'
]
# Columns to discard (review-related fields and other features excluded from
# this study).
useless_cols = [
'district',
'income_med_occupation',
'activity_months',
'host_response_time',
'first_review',
'last_review',
'number_of_reviews',
'number_of_reviews_ltm',
'review_scores_rating',
'review_scores_accuracy',
'review_scores_cleanliness',
'review_scores_checkin',
'review_scores_communication',
'review_scores_location',
'review_scores_value',
'reviews_per_month'
]
# Columns removed because they correlate strongly with other kept features.
highly_corr_cols = [
'has_refrigerator',
'host_verified_by_selfie'
]
# errors='ignore' so columns already absent from this city's file don't raise.
df.drop([*useless_cols, *highly_corr_cols], axis=1, errors='ignore', inplace=True)
df.shape
Se calcula para cada propiedad la distancia en kilómetros a diferentes puntos de interés turístico de la ciudad.
# Tourist points of interest (lat, lon) used to derive distance features.
pois = [
    {'name':'sants', 'coord':(41.37416517, 2.13749945)},
    {'name':'plaça-catalunya', 'coord':(41.387016, 2.170047)},
    {'name':'plaça-espanya', 'coord':(41.375148, 2.148426)},
    {'name':'estadio-camp-nou', 'coord':(41.380898, 2.122820)},
    {'name':'barceloneta', 'coord':(41.380894, 2.189385)},
    {'name':'montjuic', 'coord':(41.363485, 2.152609)},
    {'name':'parc-guell', 'coord':(41.4082, 2.1517)},
    {'name':'sagrada-familia', 'coord':(41.4036299, 2.1743558)},
    {'name':'casa-batllo', 'coord':(41.391640, 2.164770)},
    {'name':'drassanes-ramblas', 'coord':(41.376667, 2.175556)},
    {'name':'plaça-reial', 'coord':(41.380192, 2.175515)},
    {'name':'aeropuerto-prat', 'coord':(41.2974, 2.0833)}
]
# Add one column per POI with the haversine distance from each listing.
for poi in pois:
    poi_coord = poi['coord']
    df['dist_' + poi['name']] = df.apply(
        lambda r: get_haversine_distance(r['latitude'], r['longitude'], poi_coord),
        axis=1
    )
La característica neighbourhood tiene una cardinalidad muy alta que puede conducir a sobreajuste puesto que en algunos barrios hay pocos datos. Se propone, utilizando clusterización, una característica de cardinalidad intermedia entre barrios y distritos que agrupe barrios similares y que resulte más representativa para el estudio.
# Cluster neighbourhoods into 15 groups with KModes (handles categorical
# data), producing a geo feature of intermediate cardinality between
# neighbourhood and district.
km = KModes(n_clusters=15, init='Huang', n_init=10, random_state=42)
df['nb_cluster'] = km.fit_predict(df[['price_med_occupation_per_accommodate', 'neighbourhood']])
# Keep the raw numeric cluster ids for the choropleth below.
clusters = df['nb_cluster'].copy()
# Prefix cluster ids so get_dummies later yields readable column names.
df['nb_cluster'] = df['nb_cluster'].apply(lambda x: 'nb_' + str(x))
df.drop(['price_med_occupation_per_accommodate'], axis=1, inplace=True)  # only needed to compute the clusters
# Neighbourhood -> cluster-id mapping used as choropleth input.
cluster_map = pd.DataFrame(list(zip(df['neighbourhood'], clusters)), columns=['nb', 'cluster'])
cluster_map.drop_duplicates(inplace=True)
# NOTE(review): `fix_geojson` (and possibly `json`) are assumed to come from
# src/utils.py via the %run above — confirm they are defined there.
with open('src/geo/' + city + '.neighbourhoods.geojson') as f:
    city_nb = fix_geojson(json.load(f))
# Choropleth of neighbourhoods colored by their cluster assignment.
fig = go.Figure(go.Choroplethmapbox(
    geojson=city_nb,
    locations=cluster_map['nb'],
    z=cluster_map['cluster'],
    colorscale=px.colors.qualitative.Vivid,
    marker_opacity=0.5,
    marker_line_width=0.2
))
fig.update_layout(
    mapbox_style='carto-positron',
    mapbox_zoom=11,
    mapbox_center={'lat':df['latitude'].mean(), 'lon':df['longitude'].mean()},
    margin={"r":0,"t":0,"l":0,"b":0},
    title='clusters',
    showlegend=False
)
fig.show()
def get_s2(lat, lng):
    """Return the level-12 S2 cell id for a coordinate, prefixed with 's2_'."""
    cell = s2.CellId.from_lat_lng(s2.LatLng.from_degrees(lat, lng))
    # Level 12 gives city-block-scale cells; coarser than raw coordinates.
    cell = cell.parent(12)
    return 's2_' + str(cell.id())
# Assign each listing to its S2 cell and visualize the cells on a map.
df['s2'] = df.apply(lambda r: get_s2(r['latitude'], r['longitude']), axis=1)
# Fix: take an explicit copy — adding 'idx' to a bare slice of df triggers
# pandas' SettingWithCopyWarning and may not write where expected.
df_s2 = df[['s2', 'latitude', 'longitude']].copy()
s2_cells = sorted(df_s2['s2'].unique())
# Shuffle so adjacent cells get visually distinct colors on the map.
random.shuffle(s2_cells)
# Fix: O(1) dict lookup per row instead of O(n) list.index inside apply.
s2_rank = {cell: i for i, cell in enumerate(s2_cells)}
df_s2['idx'] = df_s2['s2'].map(s2_rank)
fig314 = go.Figure()
fig314.add_trace(go.Scattermapbox(
    lon=df_s2['longitude'],
    lat=df_s2['latitude'],
    mode='markers',
    marker_color=df_s2['idx'],
    text=df_s2['idx'],
    marker=dict(
        size=5,
        opacity=0.4,
        colorscale='spectral'
    )
))
fig314.update_layout(
    showlegend=False,
    mapbox_style='carto-positron',
    mapbox_zoom=11,
    mapbox_center={'lat': df['latitude'].mean(), 'lon': df['longitude'].mean()},
    margin={"r": 0, "t": 0, "l": 0, "b": 0}
)
fig314.show()
# Partition the city into cells by nearest POI (Voronoi tessellation over
# the POI coordinates).
poi_coords = [p['coord'] for p in pois]
vor = spatial.Voronoi(poi_coords)


def get_voronoi_index(row):
    """Return 'v_<i>' where i is the index of the POI nearest to the row."""
    point = [row['latitude'], row['longitude']]
    squared_dists = ((vor.points - point) ** 2).sum(axis=1)
    return 'v_' + str(np.argmin(squared_dists))


df['voronoi'] = df.apply(get_voronoi_index, axis=1)
spatial.voronoi_plot_2d(vor)
# Map of listings colored by their Voronoi cell, with the POIs overlaid.
# Fix: take an explicit copy — adding 'idx' to a bare slice of df triggers
# pandas' SettingWithCopyWarning and may not write where expected.
df_voronoi = df[['voronoi', 'latitude', 'longitude']].copy()
voronoi_cells = sorted(df_voronoi['voronoi'].unique())
# Fix: O(1) dict lookup per row instead of O(n) list.index inside apply.
voronoi_rank = {cell: i for i, cell in enumerate(voronoi_cells)}
df_voronoi['idx'] = df_voronoi['voronoi'].map(voronoi_rank)
fig315 = go.Figure()
fig315.add_trace(go.Scattermapbox(
    lon=df_voronoi['longitude'],
    lat=df_voronoi['latitude'],
    mode='markers',
    marker_color=df_voronoi['idx'],
    text=df_voronoi['idx'],
    marker=dict(
        size=5,
        opacity=0.4,
        colorscale='spectral'
    )
))
# Overlay the POIs themselves as black markers.
fig315.add_trace(go.Scattermapbox(
    lat=[p['coord'][0] for p in pois],
    lon=[p['coord'][1] for p in pois],
    text=[p['name'] for p in pois],
    mode='markers',
    marker=dict(
        size=8,
        opacity=0.9,
        color='black'
    )
))
fig315.update_layout(
    showlegend=False,
    mapbox_style='carto-positron',
    mapbox_zoom=11,
    mapbox_center={'lat': df['latitude'].mean(), 'lon': df['longitude'].mean()},
    margin={"r": 0, "t": 0, "l": 0, "b": 0}
)
fig315.show()
print(df.shape)
# One-hot encode all categorical columns (neighbourhood, nb_cluster, s2,
# voronoi, property_type, ...).
dfd = pd.get_dummies(df)
print(dfd.shape)
# Target and feature list shared by all the model evaluations below.
target = 'price'
features = list(dfd.columns)
features.remove(target)
# 70/30 split; fixed seed so every model comparison uses the same split.
x_train, x_test, y_train, y_test = train_test_split(
    dfd[features],
    dfd[target],
    test_size=0.3,
    random_state=42
)
x_train = x_train.astype(float)  # prevent conversion warnings
def eval_model(method, cols, df):
    """Train a CatBoost regressor on `cols` and report its results.

    Uses the module-level x_train/x_test/y_train/y_test split. Records R2,
    MAE and MSE via collect_results under the `method` label, plots feature
    importances, and returns the predictions on the test set.

    Parameters
    ----------
    method : str label identifying this experiment.
    cols : feature column names to train on.
    df : dataframe used only to label the feature-importance plot.
    """
    model = cb.CatBoostRegressor(
        verbose=0,
        random_seed=42,
        depth=10,
        iterations=150,
        learning_rate=0.1
    )
    # A one-step Pipeline keeps the structure extensible (e.g. adding a
    # scaler step later) without changing callers.
    regressor = Pipeline([('model', model)])
    regressor.fit(x_train[cols], y_train)
    y_pred = regressor.predict(x_test[cols])
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    collect_results(cols, model, method, r2, mae, mse, skip_coef=True)
    importances = regressor.named_steps['model'].feature_importances_
    print_feature_importances(method, importances, df[cols])
    # (Removed a commented-out export_graphviz block: it referenced an
    # `estimators_` attribute that CatBoostRegressor does not expose.)
    return y_pred
# Column groups for each geographic encoding, selected by name prefix on the
# one-hot-encoded dataframe.
def _prefixed(prefix):
    """Columns of dfd whose name starts with `prefix`."""
    return [c for c in dfd.columns if c.startswith(prefix)]


neighbourhood_cols = _prefixed('neighbourhood')
dist_cols = _prefixed('dist_')
coord_cols = ['latitude', 'longitude']
nb_cluster_cols = _prefixed('nb_cluster_')
s2_cols = _prefixed('s2_')
voronoi_cols = _prefixed('voronoi')
Este modelo registraría toda la variabilidad de precio que es debida a las propiedades de las viviendas sin considerar características geográficas de ningún tipo.
# Baseline model: exclude every geographic feature so only the properties of
# the listing itself explain the price.
excluded = set(neighbourhood_cols + dist_cols + coord_cols
               + nb_cluster_cols + s2_cols + voronoi_cols)
cols = [c for c in features if c not in excluded]
y_pred = eval_model('NO-GEO', cols, dfd)
Se busca si existen zonas con un error positivo o negativo.
# Residuals of the NO-GEO model; positive resid = model under-predicts the
# actual price (resid = actual - predicted).
x_test['resid'] = y_test - y_pred
plt.hist(x_test['resid'], bins=50)
plt.show()
# Remove residual outliers (IQR rule — helpers presumably defined in
# src/utils.py via %run; confirm) so the map's color scale is not dominated
# by a few extreme values.
x_test2 = x_test.copy()
x_test2.reset_index(inplace=True)
outliers_idx = get_outliers_iqr(x_test2['resid'])[0]
remove_outliers(x_test2, outliers_idx, 'resid')
plt.hist(x_test2['resid'], bins=30)
plt.show()
# Map of residuals with a diverging red-to-blue scale (red end = most
# negative residuals) to reveal zones of systematic error.
fig1 = go.Figure(go.Scattermapbox(
    lon=x_test2['longitude'],
    lat=x_test2['latitude'],
    mode='markers',
    marker_color=x_test2['resid'],
    text=x_test2['resid'],
    marker=dict(
        opacity=0.8,
        colorscale=[
            [0.0, "rgb(165,0,38)"],
            [0.11, "rgb(215,48,39)"],
            [0.22, "rgb(244,109,67)"],
            [0.33, "rgb(253,174,97)"],
            [0.44, "rgb(254,224,144)"],
            [0.55, "rgb(224,243,248)"],
            [0.66, "rgb(171,217,233)"],
            [0.77, "rgb(116,173,209)"],
            [0.88, "rgb(69,117,180)"],
            [1.0, "rgb(49,54,149)"]
        ]
    )
))
fig1.update_layout(
    mapbox_style='carto-positron',
    mapbox_zoom=11,
    mapbox_center={'lat':x_test2['latitude'].mean(), 'lon':x_test2['longitude'].mean()},
    margin={"r":0,"t":0,"l":0,"b":0}
)
fig1.show()
def _features_without(*excluded_groups):
    """Feature list minus every column appearing in the given groups."""
    excluded = set()
    for group in excluded_groups:
        excluded.update(group)
    return [c for c in features if c not in excluded]


# COORD: raw latitude/longitude as the only geographic signal.
cols = _features_without(neighbourhood_cols, dist_cols, nb_cluster_cols,
                         s2_cols, voronoi_cols)
y_pred = eval_model('COORD', cols, dfd)
# NB: one-hot neighbourhood as the only geographic signal.
cols = _features_without(dist_cols, coord_cols, nb_cluster_cols,
                         s2_cols, voronoi_cols)
y_pred = eval_model('NB', cols, dfd)
# CLUSTER-NB: neighbourhood clusters as the only geographic signal.
cols = _features_without(neighbourhood_cols, dist_cols, coord_cols,
                         s2_cols, voronoi_cols)
y_pred = eval_model('CLUSTER-NB', cols, dfd)
# DIST: POI distances as the only geographic signal.
cols = _features_without(neighbourhood_cols, coord_cols, nb_cluster_cols,
                         s2_cols, voronoi_cols)
y_pred = eval_model('DIST', cols, dfd)
# VORONOI: Voronoi cell as the only geographic signal.
cols = _features_without(neighbourhood_cols, nb_cluster_cols, coord_cols,
                         dist_cols, s2_cols)
y_pred = eval_model('VORONOI', cols, dfd)
# S2: S2 cell as the only geographic signal.
cols = _features_without(neighbourhood_cols, nb_cluster_cols, coord_cols,
                         dist_cols, voronoi_cols)
y_pred = eval_model('S2', cols, dfd)
# Shuffled working copy for the undersampling experiment (seeded for
# reproducibility; frac=1 shuffles without dropping rows).
df_us = df.copy().sample(frac=1, random_state=42)
df_us.shape
def get_max_sample_size(df):
    """Return the size of the smallest 'nb_cluster' group in `df`.

    This is the largest per-cluster sample size that still allows balanced
    undersampling.

    Bug fix: the function previously ignored its `df` parameter and read the
    global `df_us`. Existing callers pass df_us, so their results are
    unchanged, but the function now works on any dataframe it is given.
    """
    df_by_cluster = df.groupby(['nb_cluster'])['nb_cluster'].count().to_frame('count')
    df_by_cluster.reset_index(inplace=True)
    return min(df_by_cluster['count'])
def print_cluster_dist(df):
    """Show a bar chart of listing counts per nb_cluster, smallest first."""
    counts = (
        df.groupby(['nb_cluster'])['nb_cluster']
        .count()
        .to_frame('count')
        .reset_index()
        .sort_values(by=['count'])
    )
    px.bar(counts, x='nb_cluster', y='count').show()
print_cluster_dist(df_us)
# Merge the smallest clusters into a single 'nb_other' bucket so no class is
# too rare for balanced sampling.
for c in [13, 8, 14, 12, 4, 7, 5]:
    val = 'nb_' + str(c)
    # Fix: .loc replaces the chained assignment df[col][mask] = ..., which
    # raises SettingWithCopyWarning and may silently fail to write.
    df_us.loc[df_us['nb_cluster'] == val, 'nb_cluster'] = 'nb_other'
print_cluster_dist(df_us)
# Undersample every cluster down to the size of the smallest one.
sample_size = get_max_sample_size(df_us)
print(sample_size)
parts = []
for c in np.unique(df_us['nb_cluster']):
    x_tmp = df_us.loc[df_us['nb_cluster'] == c].sample(n=sample_size, random_state=42)
    parts.append(x_tmp)
df_us = pd.concat(parts)
print_cluster_dist(df_us)
# Re-encode and rebuild the train/test split on the balanced dataset
# (default 75/25 split).
df_us = pd.get_dummies(df_us)
target = 'price'
features = list(df_us.columns)
features.remove(target)
x_train, x_test, y_train, y_test = train_test_split(
    df_us[features],
    df_us[target],
    random_state=42
)
x_train = x_train.astype(float)  # prevent conversion warnings
y_pred = eval_model('CLUSTER-UNDERSAMP', x_train.columns, df_us)
# Automated feature synthesis with featuretools on a copy of the dataset.
auto_df = df.copy()
# Synthetic unique row id required by featuretools as the entity index.
auto_df['auto_id'] = auto_df['price'].apply(lambda x: uuid.uuid1().int)
# Keep the target out of feature synthesis.
prices = auto_df['price']
auto_df.drop(['price'], axis=1, inplace=True, errors='ignore')
es = ft.EntitySet(id='airbnb')
es = es.entity_from_dataframe(
    entity_id='main',
    dataframe=auto_df,
    index='auto_id'
)
# Deep feature synthesis: pairwise numeric differences, up to depth 2.
features_df, feature_names = ft.dfs(
    entityset=es,
    target_entity='main',
    trans_primitives=['subtract_numeric'],
    max_depth=2
)
auto_df = features_df.copy()
# Fix: reset_index() previously returned a new frame that was discarded
# (a no-op). Reset in place so 'auto_id' moves from the index back into a
# column and the drop below actually removes it.
auto_df.reset_index(inplace=True)
auto_df.drop(['auto_id'], axis=1, inplace=True, errors='ignore')
auto_df = pd.get_dummies(auto_df)
print(auto_df.shape)
auto_features = list(auto_df.columns)
# NOTE(review): assumes ft.dfs preserves row order so `prices` aligns
# positionally with auto_df in the split below — confirm.
x_train, x_test, y_train, y_test = train_test_split(
    auto_df,
    prices,
    random_state=42
)
x_train = x_train.astype(float)  # prevent conversion warnings
y_pred = eval_model('AUTO-FT', auto_features, auto_df)